##R code for models and analysis reported on in Watling et al. (2012) Ecological Modelling 246:79--85## 
##Code for random forest models##
library(randomForest) ##Load R libraries for analysis##
library(RSAGA)
#To create master prediction map#
data <-read.table("C:/BioClim/American_Crocodile/croc_acut_full.txt", header=T) ##Add and attach file of climate data from grid cells indicated by georeferenced presences and pseudo-absences##
attach(data)
test <-randomForest(as.factor(species1)~BIO2+BIO5+BIO6+BIO8+BIO14+BIO15+BIO16+BIO18+BIO19, ntree=500, importance=T) ##Run the random forest model##
var1pred <-scan("C:/BioClim/American_Crocodile/bio2.asc", skip=6) ##Load climate grids for each variable included in the model##
var2pred <-scan("C:/BioClim/American_Crocodile/bio5.asc", skip=6)
var3pred <-scan("C:/BioClim/American_Crocodile/bio6.asc", skip=6)
var4pred <-scan("C:/BioClim/American_Crocodile/bio8.asc", skip=6)
var5pred <-scan("C:/BioClim/American_Crocodile/bio14.asc", skip=6)
var6pred <-scan("C:/BioClim/American_Crocodile/bio15.asc", skip=6)
var7pred <-scan("C:/BioClim/American_Crocodile/bio16.asc", skip=6)
var8pred <-scan("C:/BioClim/American_Crocodile/bio18.asc", skip=6)
var9pred <-scan("C:/BioClim/American_Crocodile/bio19.asc", skip=6)
var1pred[var1pred == -9999] = NA ##Assign N/A values from the climate grids##
var2pred[var2pred == -9999] = NA
var3pred[var3pred == -9999] = NA
var4pred[var4pred == -9999] = NA
var5pred[var5pred == -9999] = NA
var6pred[var6pred == -9999] = NA
var7pred[var7pred == -9999] = NA
var8pred[var8pred == -9999] = NA
var9pred[var9pred == -9999] = NA
predict.data <-data.frame(var1pred, var2pred, var3pred, var4pred, var5pred, var6pred, var7pred, var8pred, var9pred) ##Create a data frame with all the climate data##
names(predict.data) <-c("BIO2", "BIO5", "BIO6", "BIO8", "BIO14", "BIO15", "BIO16", "BIO18", "BIO19") ##Assign names to climate variables in newly-created data frame, being sure to use the same names as used in the random forest model run above##
pred.1 <-predict(test, predict.data, type="prob", se.fit=F)  ##Extrapolate the model ('test') based on the climate conditions represented in the 'predict.data' data frame##  
pred.2 <- pred.1[,2] ##Extract the cell-by-cell probabilities from the extrapolation file 'pred.1'##
pred.3 <-na.exclude(pred.2)  ##Exclude the N/A values from the prediction##
dummy.null <-c(var1pred) ##Create a dummy vector with as many observations as the full climate grids used for extrapolation##
dummy.null[is.na(dummy.null)]<- -9999 ##Assign the N/A values in the dummy vector as -9999##
new <-replace(dummy.null, dummy.null > -9999, pred.3)  ##Replace the non-N/A values in the dummy vector with the non-N/A predictions from the extrapolation file pred.1##
write.table(new, file="C:/BioClim/Revision_Nov_2011/BioClim/American_Crocodile/temp.txt") ##Save the dummy vector as a data file in vector form##
data <-read.table("C:/BioClim/Revision_Nov_2011/BioClim/American_Crocodile/temp.txt", header=T) ##Read the vector data back into R##
data.2 <-data[,1] ##Extract the probabilities from the vector file##
hdr = list(ncols=447, nrows=449, xllcorner= -109.1871509552, yllcorner=-32.339998245239, cellsize=0.167, nodata_value = -9999) ##Create a header row of metadata for the prediction map, based on the data from the climate asciis 'var1pred', etc##
data.mat <-matrix(data.2, nrow=449, ncol=447, byrow=T) ##Format the vector file as a matrix with row and column numbers as in the climate ascii files##
write.ascii.grid(data.mat, "C:/BioClim/Revision_Nov_2011/BioClim/American_Crocodile/bioclim_pred_map.asc", header= hdr, digits = 6, georef = "corner") ##Save the prediction map as an ascii file##

##The code below calculates AUC and Cohen's kappa for model validation based on 100 random partitions of the occurrence data##
library(PresenceAbsence)##Load additional R libraries##
library(e1071)

random.data <-read.table("C:/BioClim/American_Crocodile/random_points.txt", header=T) ##Add file of climate data from grid cells representing randomly-selected pseudo-absences##
data.full <-read.table("C:/BioClim/American_Crocodile/Bioclim_files_July_2012/american_crocodile_all.txt", header=T) ##Add and attach files of climate data from grid cells representing species presences##
attach(data.full)
x<-seq(1, 100, by= 1) ##create sequence of 100 items##
for (j in x){ ##Set up loop to repeat procedure 100 times##
random <-random.data[sample((1:10000),10000),] ##Randomize the pseudo-absence data##
random.train <-random[1:7500,] ##Create a training subset with 75% of the pseudo-absence data##
random.test <-random[7501:10000,] ##Create a test subset with 25% of the pseudo-absence data##
subset <-data.full[sample((1:120),120),] ##Randomize the presence data##
pres.train <-subset[1:90,] ##Create a training subset with 75% of the presence data##
pres.test <-subset[91:120,] ##Create a test subset with 25% of the presence data##
train.data <-rbind(pres.train, random.train) ##Append the training presence subset to a 75% training subset of the pseudo-absence data## 
test.data <-rbind(pres.test, random.test) ##Append the test presence subset to a 25% test subset of the pseudo-absence data##

rf.model <-randomForest(as.factor(species)~BIO2+BIO5+BIO6+BIO8+BIO14+BIO15+BIO16+BIO18+BIO19, ntree=500, importance=T, data=train.data) ##Run random forest model with training data##
rf.predict <-predict(rf.model, test.data, type="prob") 
rf.predict.2 <-rf.predict[,2] ##Extract the cell by cell probabilities##  

##Routine to calculate AUC##
roc <-function (test.data, rf.predict.2){
if (length(test.data) != length(rf.predict.2)) 
stop("obs and preds must be equal lengths")
n.x <- length(test.data[test.data == 0])
n.y <- length(test.data[test.data == 1])
xy <- c(rf.predict.2[test.data == 0], rf.predict.2[test.data == 1])
rnk <- rank(xy)
roc <- ((n.x * n.y) + ((n.x * (n.x + 1))/2) - sum(rnk[1:n.x]))/(n.x * n.y)
}
AUC.output <-roc(test.data[,1], rf.predict.2) ##AUC value##

mat.1<-data.frame(1:2530, test.data[,1], rf.predict.2) ##Create data frame with probabilistic predictions from the model##
conmat.b <- cmx(mat.1, threshold=0.15) ##Create a confusion matrix based on assigned threshold, calculated using code below##
kappa.1<-classAgreement(conmat.b) ##Calculate kappa##
kappa.2<-kappa.1["kappa"] ##Extract kappa value##
capture.output(kappa.2, file="C:/BioClim/Revision_Nov_2011/Bioclim/American_Crocodile/kappa_test_july_2012.txt", append=T) ##Save kappa and AUC values to file##
capture.output(AUC.output, file="C:/BioClim/Revision_Nov_2011/Bioclim/American_Crocodile/AUC_july_2012.txt", append=T)
if (j== 100) break
}

##Code to calculate threshold for threshold-dependent metrics by calculating five replicate estimates of kappa for each 0.01 change in threshold between 0 and 1##

x<-seq(.01, 0.99, by=0.01)##Create a vector of probabilities from 0.01 to 0.99)##
##Routine to calculate five replicate estimates of kappa for each 0.01 change in the threshold between 0.01 and 0.99##
for (j in x){
filename <-"dataset.txt"
for (i in 1:5) {

subset <-data.full[sample((1:120),120),] ##Randomize the presence data##
training <-subset[1:90,] ##Create a training subset with 75% of the presence data##
test <-subset[91:120,] ##Create a test subset with 25% of the presence data##
random.subset <-random.data[sample((1:10000),10000),] ##Randomize the pseudo-absence data##
training.data <-rbind(training, random.subset[1:7500,]) ##Append the training presence subset to a 75% training subset of the pseudo-absence data## 
test.data <-rbind(test, random.subset[7501:10000,]) ##Append the test presence subset to a 25% test subset of the pseudo-absence data##
  
rf.model <-randomForest(as.factor(species)~temp_apr+temp_dec+temp_feb+temp_jan+temp_mar+temp_may+temp_nov+temp_oct+temp_sep, ntree=500, importance=T, data=training.data) ##Run random forest model with training data##
rf.predict <-predict(rf.model, test.data, type="prob")##Create prediction with the test data##
rf.predict.2 <-rf.predict[,2]##Extract the cell by cell probabilities##  
mat.1<-data.frame(1:2530, test.data[,1], rf.predict.2)##Create data frame with probabilistic predictions from the model##
conmat.b <- cmx(mat.1, threshold= j) ##Create a confusion matrix based on assigned threshold##
kappa.1<-classAgreement(conmat.b) ##Calculate kappa##
kappa.2<-kappa.1["kappa"]##Extract kappa value##
capture.output(kappa.2, file="C:/BioClim/Revision_Nov_2011/Bioclim/American_Crocodile/kappa_threshold.txt", append=T)##Save kappa value to file##
if (i ==5) break
}
if (j== 0.99) break
}

##The code above can be used to create models and prediction maps for monthly and bioclimate predictor variables##
##Code to calculate spatial correlation between monthly and bioclimate prediction maps##

bio<-scan("C:/BioClim/Revision_Nov_2011/BioClim/American_Crocodile/bioclim_pred_map.asc", skip=6) ##Scan bioclimate prediction map##
bio.col <-c(bio) ##Reorganize data as vector##
bio.col[bio.col==-9999] = NA ##Assign N/A values##
mon<-scan("C:/BioClim/Revision_Nov_2011/Monthly/American_Crocodile/bioclim_pred_map.asc", skip=6) ##Scan monthly prediction map##
mon.col<-c(mon) ##Reorganize data as vector##
mon.col[mon.col==-9999] = NA ##Assign N/A values##
spatial.cor <-cor(bio.col, mon.col, use="na.or.complete") ##Calculate spatial correlation##
spatial.cor ##Return spatial correlation##


##The code below is as above, but for GLM models##

#To create master prediction map#
data <-read.table("C:/BioClim/American_Crocodile/croc_acut_full.txt", header=T) ##Add and attach file of climate data from grid cells indicated by georeferenced presences and pseudo-absences##
attach(data)
test <-glm(as.factor(species1)~BIO2+BIO5+BIO6+BIO8+BIO14+BIO15+BIO16+BIO18+BIO19, family="binomial") ##Run the GLM model##
var1pred <-scan("C:/BioClim/American_Crocodile/bio2.asc", skip=6) ##Load climate grids for each variable included in the model##
var2pred <-scan("C:/BioClim/American_Crocodile/bio5.asc", skip=6)
var3pred <-scan("C:/BioClim/American_Crocodile/bio6.asc", skip=6)
var4pred <-scan("C:/BioClim/American_Crocodile/bio8.asc", skip=6)
var5pred <-scan("C:/BioClim/American_Crocodile/bio14.asc", skip=6)
var6pred <-scan("C:/BioClim/American_Crocodile/bio15.asc", skip=6)
var7pred <-scan("C:/BioClim/American_Crocodile/bio16.asc", skip=6)
var8pred <-scan("C:/BioClim/American_Crocodile/bio18.asc", skip=6)
var9pred <-scan("C:/BioClim/American_Crocodile/bio19.asc", skip=6)
var1pred[var1pred == -9999] = NA ##Assign N/A values from the climate grids##
var2pred[var2pred == -9999] = NA
var3pred[var3pred == -9999] = NA
var4pred[var4pred == -9999] = NA
var5pred[var5pred == -9999] = NA
var6pred[var6pred == -9999] = NA
var7pred[var7pred == -9999] = NA
var8pred[var8pred == -9999] = NA
var9pred[var9pred == -9999] = NA
predict.data <-data.frame(var1pred, var2pred, var3pred, var4pred, var5pred, var6pred, var7pred, var8pred, var9pred) ##Create a data frame with all the climate data##
names(predict.data) <-c("BIO2", "BIO5", "BIO6", "BIO8", "BIO14", "BIO15", "BIO16", "BIO18", "BIO19")  ##Assign names to climate variables in newly-created data frame, being sure to use the same names as used in the random forest model run above##
pred.1 <-predict(test, predict.data, type="link", se.fit=F)  ##Extrapolate the model ('test') based on the climate conditions represented in the 'predict.data' data frame##  
pred.2 <-exp(pred.1)/(1+exp(pred.1)) ##Rescale GLM predictions##
pred.3 = na.exclude(pred.2) ##Exclude N/A data from predictions##
dummy.null <-c(var1pred)  ##Create a dummy vector with as many observations as the full climate grids used for extrapolation##
dummy.null[is.na(dummy.null)]<- -9999 ##Assign the N/A values in the dummy vector as -9999##
new <-replace(dummy.null, dummy.null > -9999, pred.3) ##Replace the non-N/A values in the dummy vector with the non-N/A predictions from the extrapolation file pred.1##
write.table(new, file="C:/BioClim/Revision_Nov_2011/GLM/BioClim/American_Crocodile/temp.txt") ##Save the dummy vector as a data file in vector form##
data <-read.table("C:/BioClim/Revision_Nov_2011/GLM/BioClim/American_Crocodile/temp.txt", header=T)  ##Read the vector data back into R##
data.2 <-data[,1] ##Extract the probabilities from the vector file##
hdr = list(ncols=447, nrows=449, xllcorner= -109.1871509552, yllcorner=-32.339998245239, cellsize=0.167, nodata_value = -9999)  ##Create a header row of metadata for the prediction map, based on the data from the climate asciis 'var1pred', etc##
data.mat <-matrix(data.2, nrow=449, ncol=447, byrow=T) ##Format the vector file as a matrix with row and column numbers as in the climate ascii files##
write.ascii.grid(data.mat, "C:/BioClim/Revision_Nov_2011/GLM/BioClim/American_Crocodile/bioclim_pred_map.asc", header= hdr, digits = 6, georef = "corner") ##Save the prediction map as an ascii file##

##The code below calculates AUC and Cohen's kappa for model validation based on 100 random partitions of the occurrence data##
library(PresenceAbsence)##Load additional R libraries##
library(e1071)

random.data <-read.table("C:/BioClim/American_Crocodile/random_points.txt", header=T)  ##Add file of climate data from grid cells representing randomly-selected pseudo-absences##
data.full <-read.table("C:/BioClim/American_Crocodile/Bioclim_files_July_2012/american_crocodile_all.txt", header=T) ##Add and attach files of climate data from grid cells representing species presences##
x <-seq(1, 100, by=1) ##create sequence of 100 items##
for (j in x){ ##Set up loop to repeat procedure 100 times##
random <-random.data[sample((1:10000),10000),] ##Randomize the pseudo-absence data##
random.train <-random[1:7500,] ##Create a training subset with 75% of the pseudo-absence data##
random.test <-random[7501:10000,] ##Create a test subset with 25% of the pseudo-absence data##
subset <-data.full[sample((1:120),120),] ##Randomize the presence data##
pres.train <-subset[1:90,] ##Create a training subset with 75% of the presence data##
pres.test <-subset[91:120,] ##Create a test subset with 25% of the presence data##
train.data <-rbind(pres.train, random.train) ##Append the training presence subset to a 75% training subset of the pseudo-absence data## 
test.data <-rbind(pres.test, random.test) ##Append the test presence subset to a 25% test subset of the pseudo-absence data##

dataset.glm <-glm(species~BIO2+BIO5+BIO6+BIO8+BIO14+BIO15+BIO16+BIO18+BIO19, family="binomial", data=train.data) ##Run the GLM model##
glm.predict <-predict(dataset.glm, test.data, type="response") ##Create prediction with the test data##

##Routine to calculate AUC##
roc <-function (test.data, glm.predict){
if (length(test.data) != length(glm.predict))
        stop("obs and preds must be equal lengths")
    n.x <- length(test.data[test.data == 0])
    n.y <- length(test.data[test.data == 1])
    xy <- c(glm.predict[test.data == 0], glm.predict[test.data == 1])
    rnk <- rank(xy)
    roc <- ((n.x * n.y) + ((n.x * (n.x + 1))/2) - sum(rnk[1:n.x]))/(n.x * n.y)
}
AUC.output <-roc(test.data[,1], glm.predict)  ##AUC value##

mat.1 <-data.frame(1:2530, test.data[,1], glm.predict) ##Create data frame with probabilistic predictions from the model##
conmat.b <- cmx(mat.1, threshold=0.12) ##Create a confusion matrix based on assigned threshold, calculated using code below##
kappa.1<-classAgreement(conmat.b) ##Calculate kappa##
kappa.2<-kappa.1["kappa"] ##Extract kappa value##
capture.output(kappa.2, file="C:/BioClim/Revision_Nov_2011/GLM/Bioclim/American_Crocodile/kappa_test_july_2012.txt", append=T) ##Save kappa and AUC values to file##
capture.output(AUC.output, file="C:/BioClim/Revision_Nov_2011/GLM/Bioclim/American_Crocodile/AUC_july_2012.txt", append=T)
if (j== 100) break
}


##Code to calculate threshold for threshold-dependent metrics by calculating five replicate estimates of kappa for each 0.01 change in threshold between 0 and 1##


x<-seq(.01, 0.99, by=0.01)##Create a vector of probabilities from 0.01 to 0.99)##
##Routine to calculate five replicate estimates of kappa for each 0.01 change in the threshold between 0.01 and 0.99##
for (j in x){
filename <-"dataset.txt"
for (i in 1:5) {

subset <-data.full[sample((1:120),120),] ##Randomize the presence data##
training <-subset[1:90,] ##Create a training subset with 75% of the presence data##
test <-subset[91:120,] ##Create a test subset with 25% of the presence data##
random.subset <-random.data[sample((1:10000),10000),] ##Randomize the pseudo-absence data##
training.data <-rbind(training, random.subset[1:7500,]) ##Append the training presence subset to a 75% training subset of the pseudo-absence data## 
test.data <-rbind(test, random.subset[7501:10000,]) ##Append the test presence subset to a 25% test subset of the pseudo-absence data##

dataset.glm <-glm(species~BIO2+BIO5+BIO6+BIO8+BIO14+BIO15+BIO16+BIO18+BIO19, family="binomial", data=training.data) ##Run random forest model with training data##
glm.predict <-predict(dataset.glm, test.data, type="response") ##Create prediction with the test data##

mat.1 <-data.frame(1:9961, test.data[,1], glm.predict) ##Create data frame with probabilistic predictions from the model##
conmat <- cmx(mat.1, threshold= j) ##Create a confusion matrix based on assigned threshold##
kappa.1<-classAgreement(conmat) ##Calculate kappa##
kappa.2<-kappa.1["kappa"] ##Extract kappa value##
capture.output(kappa.2, file="C:/BioClim/Revision_Nov_2011/GLM/Monthly/American_Crocodile/kappa_test_threshold.txt", paste(i,j, kappa.2),append=T) ##Save kappa value to file##
if (i ==5) break
}
if (j== 0.99) break
}

##Code for calculating spatial correlation between GLM prediction maps is same as for random forest maps above##